## Data Cleaning

# Load the raw data file (the path is relative to the working directory).
data <- read.csv("../data/filledDatabase111119NUMONLY.csv")

# Clean the data with clean_data(), a helper defined outside this section.
data <- clean_data(data)

# Set the identifier columns aside before dropping them from the predictors.
compound <- data$Compound
group_cat <- data$GroupCat
X <- data$X

group_cat_text <- paste("Grp", group_cat)
data <- select(data, -c("Compound", "GroupCat", "X"))

# Frequency table of cluster X, most frequent first, transposed into a wide
# layout. summary() only returns per-level counts for a factor, so X is
# coerced explicitly: as.factor() leaves a factor untouched and makes the
# table work when X is a character vector as well.
X %>%
  as.factor() %>%
  summary() %>%
  as.data.frame() %>%
  set_colnames("Frequency") %>%
  rownames_to_column("X") %>%
  arrange(desc(Frequency)) %>%
  t() %>%
  kable(caption = "Frequency table for cluster X of data version 11/11/19") %>%
  kable_styling(bootstrap_options = "striped")
Table 1: Frequency table for cluster X of data version 11/11/19
X O F Cl Br I S Se l Te N
Frequency 259 71 50 25 21 7 4 3 1 0

Cluster O

PCA circle

# Cluster O: keep the rows whose X label is "O".
# %in% never returns NA, so rows with a missing X are simply excluded instead
# of becoming all-NA rows in the logical subsetting below.
rows_to_take <- X %in% "O"

subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]

# PCA circle: predictor vectors in the space of PC1 and PC2.
# `scale.` is written out in full; `scale = TRUE` only reached prcomp()
# through partial argument matching.
fviz_pca_var(
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid overlapping text labels
)
Cluster O: vectors of predictors in the space of PC1 and PC2

Figure 1: Cluster O: vectors of predictors in the space of PC1 and PC2

Compounds in space of PC’s

# Seven-colour palette repeated ten times so it never runs out of entries
# when indexed by later chunks.
set_color <- rep(
  c("#0071C3", "#DE501A", "#EEB020", "#7E2E8E", "#79AC2C", "#4DBDF7", "#A51331"),
  10
)

# Coordinates of the cluster on its first three PCs, column-standardised.
# get_pc_space() is a project helper; presumably it returns the PC scores
# with columns PC1, PC2, ... (TODO confirm against its definition).
data_pca <- scale(get_pc_space(subset, k = 3))

# Compounds on PC1 vs PC2, labelled by name and coloured by group category.
ggplot(
  data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca),
  aes(x = PC1, y = PC2, color = GroupCat)
) +
  geom_point(aes(color = GroupCat), size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound, color = GroupCat), size = 3) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Cluster O: compounds in the space of the first two PC's

Figure 2: Cluster O: compounds in the space of the first two PC’s

PCA biplot

# Use the compound names (made syntactically valid and unique) as row names
# of the data passed to prcomp().
rownames(subset) <- make.names(compound_sub, unique = TRUE)
fit <- prcomp(subset, scale. = TRUE) # `scale.` spelled out, no partial matching

# Biplot on PC1 and PC2: compounds as points filled by group category,
# predictor vectors coloured by contribution.
fviz_pca_biplot(
  fit,
  # `axes` is the documented argument name; the former `aesx` spelling did
  # not match it.
  axes = c(1, 2),
  # individuals
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variables
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Cluster O: compounds and predictor vectors in the space of the first two PC's

Figure 3: Cluster O: compounds and predictor vectors in the space of the first two PC’s

Cluster F

PCA circle

# Cluster F: keep the rows whose X label is "F".
# %in% never returns NA, so rows with a missing X are simply excluded instead
# of becoming all-NA rows in the logical subsetting below.
rows_to_take <- X %in% "F"

subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]

# PCA circle: predictor vectors in the space of PC1 and PC2.
# `scale.` is written out in full; `scale = TRUE` only reached prcomp()
# through partial argument matching.
fviz_pca_var(
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid overlapping text labels
)
Cluster F: vectors of predictors in the space of PC1 and PC2

Figure 4: Cluster F: vectors of predictors in the space of PC1 and PC2

Compounds in space of PC’s

# Coordinates of the cluster on its first three PCs, column-standardised.
# get_pc_space() is a project helper; presumably it returns the PC scores
# with columns PC1, PC2, ... (TODO confirm against its definition).
data_pca <- scale(get_pc_space(subset, k = 3))

# Compounds on PC1 vs PC2, labelled by name and coloured by group category.
ggplot(
  data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca),
  aes(x = PC1, y = PC2, color = GroupCat)
) +
  geom_point(aes(color = GroupCat), size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound, color = GroupCat), size = 3) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Cluster F: compounds in the space of the first two PC's

Figure 5: Cluster F: compounds in the space of the first two PC’s

PCA biplot

# Use the compound names (made syntactically valid and unique) as row names
# of the data passed to prcomp().
rownames(subset) <- make.names(compound_sub, unique = TRUE)
fit <- prcomp(subset, scale. = TRUE) # `scale.` spelled out, no partial matching

# Biplot on PC1 and PC2: compounds as points filled by group category,
# predictor vectors coloured by contribution.
fviz_pca_biplot(
  fit,
  # `axes` is the documented argument name; the former `aesx` spelling did
  # not match it.
  axes = c(1, 2),
  # individuals
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variables
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Cluster F: compounds and predictor vectors in the space of the first two PC's

Figure 6: Cluster F: compounds and predictor vectors in the space of the first two PC’s

Cluster Cl

PCA circle

# Cluster Cl: keep the rows whose X label is "Cl".
# %in% never returns NA, so rows with a missing X are simply excluded instead
# of becoming all-NA rows in the logical subsetting below.
rows_to_take <- X %in% "Cl"

subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]

# PCA circle: predictor vectors in the space of PC1 and PC2.
# `scale.` is written out in full; `scale = TRUE` only reached prcomp()
# through partial argument matching.
fviz_pca_var(
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid overlapping text labels
)
Cluster Cl: vectors of predictors in the space of PC1 and PC2

Figure 7: Cluster Cl: vectors of predictors in the space of PC1 and PC2

Compounds in space of PC’s

# Coordinates of the cluster on its first three PCs, column-standardised.
# get_pc_space() is a project helper; presumably it returns the PC scores
# with columns PC1, PC2, ... (TODO confirm against its definition).
data_pca <- scale(get_pc_space(subset, k = 3))

# Compounds on PC1 vs PC2, labelled by name and coloured by group category.
ggplot(
  data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca),
  aes(x = PC1, y = PC2, color = GroupCat)
) +
  geom_point(aes(color = GroupCat), size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound, color = GroupCat), size = 3) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Cluster Cl: compounds in the space of the first two PC's

Figure 8: Cluster Cl: compounds in the space of the first two PC’s

PCA biplot

# Use the compound names (made syntactically valid and unique) as row names
# of the data passed to prcomp().
rownames(subset) <- make.names(compound_sub, unique = TRUE)
fit <- prcomp(subset, scale. = TRUE) # `scale.` spelled out, no partial matching

# Biplot on PC1 and PC2: compounds as points filled by group category,
# predictor vectors coloured by contribution.
fviz_pca_biplot(
  fit,
  # `axes` is the documented argument name; the former `aesx` spelling did
  # not match it.
  axes = c(1, 2),
  # individuals
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variables
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Cluster Cl: compounds and predictor vectors in the space of the first two PC's

Figure 9: Cluster Cl: compounds and predictor vectors in the space of the first two PC’s

Cluster Br

PCA circle

# Cluster Br: keep the rows whose X label is "Br".
# %in% never returns NA, so rows with a missing X are simply excluded instead
# of becoming all-NA rows in the logical subsetting below.
rows_to_take <- X %in% "Br"

subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]

# PCA circle: predictor vectors in the space of PC1 and PC2.
# `scale.` is written out in full; `scale = TRUE` only reached prcomp()
# through partial argument matching.
fviz_pca_var(
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid overlapping text labels
)
Cluster Br: vectors of predictors in the space of PC1 and PC2

Figure 10: Cluster Br: vectors of predictors in the space of PC1 and PC2

Compounds in space of PC’s

# Coordinates of the cluster on its first three PCs, column-standardised.
# get_pc_space() is a project helper; presumably it returns the PC scores
# with columns PC1, PC2, ... (TODO confirm against its definition).
data_pca <- scale(get_pc_space(subset, k = 3))

# Compounds on PC1 vs PC2, labelled by name and coloured by group category.
ggplot(
  data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca),
  aes(x = PC1, y = PC2, color = GroupCat)
) +
  geom_point(aes(color = GroupCat), size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound, color = GroupCat), size = 3) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Cluster Br: compounds in the space of the first two PC's

Figure 11: Cluster Br: compounds in the space of the first two PC’s

PCA biplot

# Use the compound names (made syntactically valid and unique) as row names
# of the data passed to prcomp().
rownames(subset) <- make.names(compound_sub, unique = TRUE)
fit <- prcomp(subset, scale. = TRUE) # `scale.` spelled out, no partial matching

# Biplot on PC1 and PC2: compounds as points filled by group category,
# predictor vectors coloured by contribution.
fviz_pca_biplot(
  fit,
  # `axes` is the documented argument name; the former `aesx` spelling did
  # not match it.
  axes = c(1, 2),
  # individuals
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variables
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Cluster Br: compounds and predictor vectors in the space of the first two PC's

Figure 12: Cluster Br: compounds and predictor vectors in the space of the first two PC’s

Cluster I

PCA circle

# Cluster I: keep the rows whose X label is "I" (capital i).
# %in% never returns NA, so rows with a missing X are simply excluded instead
# of becoming all-NA rows in the logical subsetting below.
# NOTE(review): the frequency table of X also lists a separate level "l"
# (lower-case L, 3 rows). If those are mistyped "I" entries they are not
# picked up here -- confirm against the raw data.
rows_to_take <- X %in% "I"

subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]

# PCA circle: predictor vectors in the space of PC1 and PC2.
# `scale.` is written out in full; `scale = TRUE` only reached prcomp()
# through partial argument matching.
fviz_pca_var(
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # colour by contribution to the PCs
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid overlapping text labels
)
Cluster I: vectors of predictors in the space of PC1 and PC2

Figure 13: Cluster I: vectors of predictors in the space of PC1 and PC2

Compounds in space of PC’s

# Coordinates of the cluster on its first three PCs, column-standardised.
# get_pc_space() is a project helper; presumably it returns the PC scores
# with columns PC1, PC2, ... (TODO confirm against its definition).
data_pca <- scale(get_pc_space(subset, k = 3))

# Compounds on PC1 vs PC2, labelled by name and coloured by group category.
ggplot(
  data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca),
  aes(x = PC1, y = PC2, color = GroupCat)
) +
  geom_point(aes(color = GroupCat), size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound, color = GroupCat), size = 3) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Cluster I: compounds in the space of the first two PC's

Figure 14: Cluster I: compounds in the space of the first two PC’s

PCA biplot

# Use the compound names (made syntactically valid and unique) as row names
# of the data passed to prcomp().
rownames(subset) <- make.names(compound_sub, unique = TRUE)
fit <- prcomp(subset, scale. = TRUE) # `scale.` spelled out, no partial matching

# Biplot on PC1 and PC2: compounds as points filled by group category,
# predictor vectors coloured by contribution.
fviz_pca_biplot(
  fit,
  # `axes` is the documented argument name; the former `aesx` spelling did
  # not match it.
  axes = c(1, 2),
  # individuals
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variables
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Cluster I: compounds and predictor vectors in the space of the first two PC's

Figure 15: Cluster I: compounds and predictor vectors in the space of the first two PC’s